import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# -----------------------------------------------------------------------------
# 1) Prepare your data
# -----------------------------------------------------------------------------
# Count postings for every (data-analyst flag, industry) combination.
df_grouped = (
    eda
    .groupby(['DATA_ANALYST_JOB', 'NAICS2_NAME'])
    .size()
    .reset_index(name='Job_Count')
)

# Long NAICS sector names are replaced by short axis-friendly labels;
# sectors missing from this mapping keep their original name.
short_names = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services',
}
df_grouped['Industry'] = (
    df_grouped['NAICS2_NAME'].map(short_names).fillna(df_grouped['NAICS2_NAME'])
)
df_grouped['Job_Type'] = df_grouped['DATA_ANALYST_JOB'].map({True: 'True', False: 'False'})

pivot = (
    df_grouped
    # Counts must be added (not averaged) if several rows share a label.
    .pivot_table(
        index='Industry',
        columns='Job_Type',
        values='Job_Count',
        aggfunc='sum',
        fill_value=0,
    )
    # Guarantee both columns exist even when one flag value never occurs,
    # otherwise pivot['True'] / pivot['False'] below raises KeyError.
    .reindex(columns=['True', 'False'], fill_value=0)
    .astype(int)
    .reset_index()
)

industries = pivot['Industry'].tolist()
y_true = pivot['True'].tolist()
y_false = pivot['False'].tolist()

# -----------------------------------------------------------------------------
# 2) Build a 2-row subplot: bar on top, table below
# -----------------------------------------------------------------------------
fig = make_subplots(
    rows=2,
    cols=1,
    row_heights=[0.70, 0.30],  # give a bit more room to the table
    specs=[[{"type": "bar"}], [{"type": "table"}]],
    vertical_spacing=0.12,  # more space between bar and table
)

colors = {'True': '#FFE5E5', 'False': '#FF6B6B'}

# Trace 0: postings flagged as data-analyst jobs.
fig.add_trace(
    go.Bar(
        x=industries,
        y=y_true,
        name='True',
        marker=dict(color=colors['True'], line=dict(color='#A81D1D', width=1)),
        text=y_true,
        textposition='outside',
    ),
    row=1,
    col=1,
)

# Trace 1: postings not flagged as data-analyst jobs.
fig.add_trace(
    go.Bar(
        x=industries,
        y=y_false,
        name='False',
        marker=dict(color=colors['False'], line=dict(color='#A81D1D', width=1)),
        text=y_false,
        textposition='outside',
    ),
    row=1,
    col=1,
)

# Trace 2: reference table with the unfiltered counts (the slider leaves it alone).
fig.add_trace(
    go.Table(
        header=dict(
            values=["Industry", "True", "False"],
            fill_color='#FDEDEC',
            align='left',
            font=dict(color='#A81D1D', size=13),
            height=30,
        ),
        cells=dict(
            values=[industries, y_true, y_false],
            fill_color='white',
            align='left',
            font=dict(color='#333', size=11),
            height=22,
        ),
    ),
    row=2,
    col=1,
)

# -----------------------------------------------------------------------------
# 3) Slider steps: 0 → 8 000 in 200s
# -----------------------------------------------------------------------------
BAR_TRACES = [0, 1]  # indices of the two bar traces; the table is trace 2


def _heights_at(values, threshold):
    """Return *values* with every entry below *threshold* replaced by 0."""
    return [v if v >= threshold else 0 for v in values]


def _labels_at(values, threshold):
    """Return bar labels for *values*, blank where the bar is filtered out."""
    return [v if v >= threshold else '' for v in values]


steps = []
for val in range(0, 8001, 200):
    steps.append(dict(
        label=str(val),
        method="update",
        args=[
            {
                "y": [_heights_at(y_true, val), _heights_at(y_false, val)],
                # Keep the labels in sync, otherwise a hidden bar still
                # shows its original count above an empty slot.
                "text": [_labels_at(y_true, val), _labels_at(y_false, val)],
            },
            # Only swap the title text so its font/position settings survive.
            {"title.text": f"Min Jobs ≥ {val:,}"},
            # Restrict the restyle to the bars; without this the data update
            # is also applied to the table trace.
            BAR_TRACES,
        ],
    ))

# -----------------------------------------------------------------------------
# 4) Final layout tweaks
# -----------------------------------------------------------------------------
# Tallest bar across both series; fall back to 1 so an empty or all-zero
# dataset still yields a valid axis range.
peak_count = max(y_true + y_false, default=0) or 1

fig.update_layout(
    # lift slider above everything
    sliders=[dict(
        active=0,
        currentvalue={"prefix": "Min Jobs: "},
        pad={"b": 0},
        x=0.15,
        y=1.18,  # move slider way above the plot area
        xanchor="left",
        yanchor="bottom",
        len=0.7,
        font=dict(color='#A81D1D'),
        steps=steps,
    )],
    title=dict(
        text="Data & Business Analytics Job Trends",
        font=dict(size=24, color='#A81D1D'),
        x=0.5,
        y=0.92,  # drop the title just below the slider
        xanchor="center",
        yanchor="top",
    ),
    width=1100,
    height=850,
    margin=dict(l=60, r=60, t=180, b=200),  # extra top & bottom margin
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(
        title="Industry",
        title_font=dict(size=16, color='#A81D1D'),
        tickmode='array',
        tickvals=list(range(len(industries))),
        ticktext=industries,
        tickangle=-30,
        tickfont=dict(size=11, color='#333'),
        showline=True,
        linecolor='#A81D1D',
    ),
    yaxis=dict(
        title="Number of Jobs",
        title_font=dict(size=16, color='#A81D1D'),
        tickfont=dict(size=11, color='#333'),
        gridcolor='rgba(200,200,200,0.3)',
        showline=True,
        linecolor='#A81D1D',
        range=[0, peak_count * 1.2],  # 20% headroom for the outside labels
    ),
    legend=dict(
        title="Data Analyst Job",
        title_font=dict(color='#A81D1D'),
        font=dict(size=12),
        x=1.02,
        y=0.5,
    ),
    bargap=0.2,
)

fig.show()
Code
import plotly.express as px
import pandas as pd

# Work on a copy so the shared `eda` frame is left untouched.
df = eda.copy()


def classify_analytics_job(row):
    """Return True when a posting counts as an analytics job.

    A row qualifies when its DATA_ANALYST_JOB flag is truthy, or when its
    title (TITLE_NAME if that field exists on the row, otherwise TITLE)
    contains "business analyst", ignoring case.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    raw_title = row['TITLE_NAME'] if 'TITLE_NAME' in row else row['TITLE']
    return 'business analyst' in str(raw_title).lower()


# Flag every posting, then derive the label used for colouring.
df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map(
    {True: 'Analytics Job', False: 'Non-Analytics Job'}
)

# One box per remote type, split by job category.
fig = px.box(
    df,
    x='REMOTE_TYPE_NAME',
    y='SALARY',
    color='Job_Category',
    title='Salary Distribution by Remote Type for Analytics vs Non-Analytics Jobs',
    labels={
        'REMOTE_TYPE_NAME': 'Remote Type',
        'SALARY': 'Salary ($)',
        'Job_Category': 'Job Category',
    },
    color_discrete_map={
        'Analytics Job': '#FF6B6B',
        'Non-Analytics Job': '#4ECDC4',
    },
)

# Settings shared by both axes.
axis_base = {
    "title_font": {"size": 16},
    "tickfont": {"size": 12},
    "gridcolor": "#E2E8F0",
    "linecolor": "#2D3748",
    "linewidth": 2,
    "showline": True,
}

# Red-on-white theme, flat colours only (no gradients).
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',   # plain white plotting area
    paper_bgcolor='#FFFFFF',  # plain white surrounding paper
    font={"family": "Inter, sans-serif", "size": 14, "color": "#2D3748"},
    title={
        "font": {"size": 24, "color": "#FF6B6B"},  # red title for the theme
        "x": 0.5,
        "xanchor": "center",
        "y": 0.95,
        "yanchor": "top",
    },
    xaxis={"title": "Remote Type", **axis_base},
    yaxis={"title": "Salary ($)", **axis_base, "showgrid": True, "zeroline": False},
    legend={
        "title": "Job Category",
        "font": {"size": 13},
        "bgcolor": "#FFFFFF",
        "bordercolor": "#FF6B6B",  # red border for the theme
        "borderwidth": 1,
        "x": 1.02,
        "y": 0.5,
        "xanchor": "left",
        "yanchor": "middle",
    },
    hovermode="closest",
    hoverlabel={
        "bgcolor": "#FFFFFF",
        "font_size": 12,
        "font_family": "Inter, sans-serif",
        "font_color": "#2D3748",
        "bordercolor": "#FF6B6B",  # red border for hover boxes
    },
)

# Render the figure.
fig.show()
Code
import plotly.express as px
import pandas as pd

# Work on a copy so the shared `eda` frame is left untouched.
df = eda.copy()


def classify_analytics_job(row):
    """Return True when a posting counts as an analytics job.

    A row qualifies when its DATA_ANALYST_JOB flag is truthy, or when its
    title (TITLE_NAME if that field exists on the row, otherwise TITLE)
    contains "business analyst", ignoring case.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    raw_title = row['TITLE_NAME'] if 'TITLE_NAME' in row else row['TITLE']
    return 'business analyst' in str(raw_title).lower()


# Boolean flag -> legend label, reused for the raw and the grouped frame.
category_names = {True: 'Analytics Job', False: 'Non-Analytics Job'}

df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map(category_names)

# Number of postings for every (industry, analytics flag) pair.
pair_sizes = df.groupby(['NAICS2_NAME', 'IS_ANALYTICS_JOB']).size()
df_grouped = pair_sizes.reset_index(name='Job_Count')
df_grouped['Job_Category'] = df_grouped['IS_ANALYTICS_JOB'].map(category_names)

# Compact labels for the x axis; unmapped sectors keep their full name.
short_map = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services',
}
full_names = df_grouped['NAICS2_NAME']
df_grouped['Industry'] = full_names.map(short_map).fillna(full_names)

# Stacked bars: one bar per industry, segmented by job category.
fig = px.bar(
    df_grouped,
    x='Industry',
    y='Job_Count',
    color='Job_Category',
    title='Top Industries Hiring Analytics Jobs',
    labels={
        'Industry': 'Industry',
        'Job_Count': 'Number of Jobs',
        'Job_Category': 'Job Category',
    },
    barmode='stack',
    color_discrete_map={
        'Analytics Job': '#FF6B6B',
        'Non-Analytics Job': '#4ECDC4',
    },
)

# Settings shared by both axes.
axis_base = {
    "title_font": {"size": 16},
    "tickfont": {"size": 12},
    "gridcolor": "#E2E8F0",
    "linecolor": "#2D3748",
    "linewidth": 2,
    "showline": True,
}

# Red-on-white theme, flat colours only (no gradients).
fig.update_layout(
    width=1000,
    height=600,
    plot_bgcolor='#FFFFFF',   # plain white plotting area
    paper_bgcolor='#FFFFFF',  # plain white surrounding paper
    font={"family": "Inter, sans-serif", "size": 14, "color": "#2D3748"},
    title={
        "font": {"size": 24, "color": "#FF6B6B"},  # red title for the theme
        "x": 0.5,
        "xanchor": "center",
        "y": 0.95,
        "yanchor": "top",
    },
    xaxis={"title": "Industry", **axis_base, "tickangle": -45},
    yaxis={"title": "Number of Jobs", **axis_base, "showgrid": True, "zeroline": False},
    legend={
        "title": "Job Category",
        "font": {"size": 13},
        "bgcolor": "#FFFFFF",
        "bordercolor": "#FF6B6B",  # red border for the theme
        "borderwidth": 1,
        "x": 1.02,
        "y": 0.5,
        "xanchor": "left",
        "yanchor": "middle",
    },
    hovermode="closest",
    hoverlabel={
        "bgcolor": "#FFFFFF",
        "font_size": 12,
        "font_family": "Inter, sans-serif",
        "font_color": "#2D3748",
        "bordercolor": "#FF6B6B",  # red border for hover boxes
    },
)

# Render the figure.
fig.show()
Code
import importlib.util

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Prepare the data
df = eda.copy()


# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    """Return True when a posting counts as an analytics job.

    A row qualifies when its DATA_ANALYST_JOB flag is truthy, or when its
    title (TITLE_NAME if that field exists on the row, otherwise TITLE)
    contains "business analyst", ignoring case.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    title = str(row['TITLE_NAME']).lower() if 'TITLE_NAME' in row else str(row['TITLE']).lower()
    if 'business analyst' in title:
        return True
    return False


df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Calculate average years of experience (midpoint of the min/max requirement)
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2

# Clean the data (remove rows with missing salary or experience)
df = df.dropna(subset=['Avg_Years_Experience', 'SALARY'])

palette = {'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'}

# plotly express imports statsmodels lazily for trendline='ols' and raises
# ModuleNotFoundError when it is not installed. Detect that up front and
# draw an equivalent straight-line fit with numpy instead of crashing.
has_statsmodels = importlib.util.find_spec("statsmodels") is not None

# Create the scatter plot (with the built-in OLS trend line when available)
fig = px.scatter(
    df,
    x='Avg_Years_Experience',
    y='SALARY',
    color='Job_Category',
    trendline='ols' if has_statsmodels else None,
    title='Experience Requirements vs Salary for Analytics Jobs',
    labels={
        'Avg_Years_Experience': 'Average Years of Experience',
        'SALARY': 'Salary ($)',
        'Job_Category': 'Job Category',
    },
    color_discrete_map=palette,
)

# Customize scatter points; the selector keeps trend-line traces untouched.
fig.update_traces(
    marker=dict(
        size=8,
        opacity=0.7,
        line=dict(width=1, color="#2D3748"),
    ),
    selector=dict(mode='markers'),
)

if not has_statsmodels:
    # Fallback: one least-squares line per job category, drawn between the
    # smallest and largest experience value of that category.
    for category, group in df.groupby('Job_Category'):
        x_vals = group['Avg_Years_Experience'].to_numpy(dtype=float)
        y_vals = group['SALARY'].to_numpy(dtype=float)
        if np.unique(x_vals).size < 2:
            # A line cannot be fitted through a single distinct x value.
            continue
        slope, intercept = np.polyfit(x_vals, y_vals, 1)
        x_ends = np.array([x_vals.min(), x_vals.max()])
        fig.add_trace(go.Scatter(
            x=x_ends,
            y=slope * x_ends + intercept,
            mode='lines',
            name=f'{category} trend',
            legendgroup=category,  # toggles together with its points
            showlegend=False,
            line=dict(color=palette.get(category)),
        ))

# Beautify the layout with a red-white theme (no gradients)
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',   # Plain white background
    paper_bgcolor='#FFFFFF',  # Plain white background
    font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
    title=dict(
        font=dict(size=24, color="#FF6B6B"),  # Red title for theme
        x=0.5,
        xanchor="center",
        y=0.95,
        yanchor="top",
    ),
    xaxis=dict(
        title="Average Years of Experience",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False,
    ),
    yaxis=dict(
        title="Salary ($)",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False,
    ),
    legend=dict(
        title="Job Category",
        font=dict(size=13),
        bgcolor="#FFFFFF",
        bordercolor="#FF6B6B",  # Red border for theme
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor="left",
        yanchor="middle",
    ),
    hovermode="closest",
    hoverlabel=dict(
        bgcolor="#FFFFFF",
        font_size=12,
        font_family="Inter, sans-serif",
        font_color="#2D3748",
        bordercolor="#FF6B6B",  # Red border for hover
    ),
)

# Show the plot
fig.show()
---------------------------------------------------------------------------ModuleNotFoundError Traceback (most recent call last)
Cell In[7], line 26 23 df = df.dropna(subset=['Avg_Years_Experience', 'SALARY'])
25# Create the scatter plot with trend line---> 26 fig =px.scatter(df, 27x='Avg_Years_Experience', 28y='SALARY', 29color='Job_Category', 30trendline='ols',# Add trend line (ordinary least squares) 31title='Experience Requirements vs Salary for Analytics Jobs', 32labels={'Avg_Years_Experience':'Average Years of Experience','SALARY':'Salary ($)','Job_Category':'Job Category'}, 33color_discrete_map={'Analytics Job':'#FF6B6B','Non-Analytics Job':'#4ECDC4'}) 35# Beautify the layout with a red-white theme (no gradients) 36 fig.update_layout(
37 width=900,
38 height=600,
(...) 89 )
90 )
File ~/Documents/Semester2/Web_analytics/ad688-employability-sp25A1-group11/.venv/lib/python3.13/site-packages/plotly/express/_chart_types.py:69, in scatter(data_frame, x, y, color, symbol, size, hover_name, hover_data, custom_data, text, facet_row, facet_col, facet_col_wrap, facet_row_spacing, facet_col_spacing, error_x, error_x_minus, error_y, error_y_minus, animation_frame, animation_group, category_orders, labels, orientation, color_discrete_sequence, color_discrete_map, color_continuous_scale, range_color, color_continuous_midpoint, symbol_sequence, symbol_map, opacity, size_max, marginal_x, marginal_y, trendline, trendline_options, trendline_color_override, trendline_scope, log_x, log_y, range_x, range_y, render_mode, title, subtitle, template, width, height) 14defscatter(
15 data_frame=None,
16 x=None,
(...) 63 height=None,
64 ) -> go.Figure:
65""" 66 In a scatter plot, each row of `data_frame` is represented by a symbol 67 mark in 2D space. 68 """---> 69returnmake_figure(args=locals(),constructor=go.Scatter)
File ~/Documents/Semester2/Web_analytics/ad688-employability-sp25A1-group11/.venv/lib/python3.13/site-packages/plotly/express/_core.py:2668, in make_figure(args, constructor, trace_patch, layout_patch) 2665elif args["ecdfnorm"] =="percent":
2666 group = group.with_columns((nw.col(var) / group_sum) *100.0)
-> 2668 patch, fit_results =make_trace_kwargs( 2669args,trace_spec,group,mapping_labels.copy(),sizeref 2670) 2671 trace.update(patch)
2672if fit_results isnotNone:
File ~/Documents/Semester2/Web_analytics/ad688-employability-sp25A1-group11/.venv/lib/python3.13/site-packages/plotly/express/_core.py:430, in make_trace_kwargs(args, trace_spec, trace_data, mapping_labels, sizeref) 427 trace_patch["x"] = trace_patch["x"].to_numpy()
429 trendline_function = trendline_functions[attr_value]
--> 430 y_out, hover_header, fit_results =trendline_function( 431args["trendline_options"], 432sorted_trace_data.get_column(args["x"]),# narwhals series 433x.to_numpy(),# numpy array 434y.to_numpy(),# numpy array 435args["x"], 436args["y"], 437non_missing.to_numpy(),# numpy array 438) 439assertlen(y_out) ==len(
440 trace_patch["x"]
441 ), "missing-data-handling failure in trendline code" 442 trace_patch["y"] = y_out
File ~/Documents/Semester2/Web_analytics/ad688-employability-sp25A1-group11/.venv/lib/python3.13/site-packages/plotly/express/trendline_functions/__init__.py:42, in ols(trendline_options, x_raw, x, y, x_label, y_label, non_missing) 36if k notin valid_options:
37raiseValueError(
38"OLS trendline_options keys must be one of [%s] but got '%s'" 39% (", ".join(valid_options), k)
40 )
---> 42importstatsmodels.apiassm 44 add_constant = trendline_options.get("add_constant", True)
45 log_x = trendline_options.get("log_x", False)
ModuleNotFoundError: No module named 'statsmodels'
Code
import plotly.graph_objects as go
import pandas as pd

# Work on a copy so the shared `eda` frame is left untouched.
df = eda.copy()


def classify_analytics_job(row):
    """Return True when a posting counts as an analytics job.

    A row qualifies when its DATA_ANALYST_JOB flag is truthy, or when its
    title (TITLE_NAME if that field exists on the row, otherwise TITLE)
    contains "business analyst", ignoring case.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    raw_title = row['TITLE_NAME'] if 'TITLE_NAME' in row else row['TITLE']
    return 'business analyst' in str(raw_title).lower()


df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map(
    {True: 'Analytics Job', False: 'Non-Analytics Job'}
)

# Keep analytics postings only, then drop those without an industry.
df_analytics = df[df['IS_ANALYTICS_JOB']].copy()
df_analytics = df_analytics.dropna(subset=['NAICS2_NAME'])

# Posting count for every (job category, industry) pair.
pair_sizes = df_analytics.groupby(['Job_Category', 'NAICS2_NAME']).size()
df_grouped = pair_sizes.reset_index(name='Job_Count')

# Compact industry labels; unmapped sectors keep their full name.
short_map = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services',
}
full_names = df_grouped['NAICS2_NAME']
df_grouped['NAICS2_NAME'] = full_names.map(short_map).fillna(full_names)

# Sankey nodes: job categories first, followed by the industries.
labels = [*df_grouped['Job_Category'].unique(), *df_grouped['NAICS2_NAME'].unique()]

# Position of each label's first occurrence in `labels`.
first_position = {}
for position, node_name in enumerate(labels):
    first_position.setdefault(node_name, position)

# Each grouped row becomes one link: category node -> industry node.
source = [first_position[job_cat] for job_cat in df_grouped['Job_Category']]
target = [first_position[industry] for industry in df_grouped['NAICS2_NAME']]
value = df_grouped['Job_Count'].tolist()

sankey_trace = go.Sankey(
    node={
        "pad": 15,
        "thickness": 20,
        "line": {"color": "#2D3748", "width": 0.5},
        "label": labels,
        "color": "#FF6B6B",  # red nodes for the theme
    },
    link={
        "source": source,
        "target": target,
        "value": value,
        "color": "rgba(255, 107, 107, 0.5)",  # semi-transparent red links
    },
)
fig = go.Figure(data=[sankey_trace])

# Red-on-white theme, flat colours only (no gradients).
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',   # plain white plotting area
    paper_bgcolor='#FFFFFF',  # plain white surrounding paper
    font={"family": "Inter, sans-serif", "size": 14, "color": "#2D3748"},
    title={
        "text": 'Distribution of Analytics Job Postings by Industry',
        "font": {"size": 24, "color": "#FF6B6B"},  # red title for the theme
        "x": 0.5,
        "xanchor": "center",
        "y": 0.95,
        "yanchor": "top",
    },
    margin={"l": 20, "r": 20, "t": 80, "b": 20},
)

# Render the figure.
fig.show()